Assess genomic coverage of various Tbg isolates to VSG expressed in patients. Run bowtie using a variety of read lengths from sequenced T.b. gambiense genomes and calculate/compare coverage of assembled patient VSG from our study.
packages used:
Trim_Galore (version 0.5.0)
Trimmomatic (version 0.38)
Bowtie with the parameters -v 2 -a -S (version 1.1.1)
Bedtools (version 2.27.0)
samtools
# raw sequencing read files each uploaded to their own directory designated by strain name
# datasets provided as paired end reads concatenated into single fq file and analyzed as if single end
# we can iterate through each genome using their distinctive directory names with this bash script
for path in seqFiles/*; do
SAMPLEID="$(basename "${path}")"
# quality and adapter trim raw reads
trim_galore --dont_gzip -o seqFiles/$SAMPLEID/ seqFiles/$SAMPLEID/*.fq
# truncate reads to desired lengths
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/20BP.fq CROP:20
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/30BP.fq CROP:30
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/50BP.fq CROP:50
# generated an indexed bowtie reference for Tbg Patient VSG called tbgPatientsbt
# run bowtie for each read query size
(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/20BP.fq seqFiles/$SAMPLEID/20BP.bt.sam) 2> seqFiles/$SAMPLEID/20BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/20BP.bt.sam -o seqFiles/$SAMPLEID/20BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/20BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.20BP.txt
(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/30BP.fq seqFiles/$SAMPLEID/30BP.bt.sam) 2> seqFiles/$SAMPLEID/30BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/30BP.bt.sam -o seqFiles/$SAMPLEID/30BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/30BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.30BP.txt
(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/50BP.fq seqFiles/$SAMPLEID/50BP.bt.sam) 2> seqFseqFiles/$SAMPLEID/50BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/50BP.bt.sam -o seqFiles/$SAMPLEID/50BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/50BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.50BP.txt
done
Use iRanges R package to collapse overlapping reads into single range of VSG gene coverage
Available in FigureData directory as compressed .zip files
# calculate percent coverage
calc_cov <- function(df){
pct_cov <- df %>% group_by(genome, new_name) %>%
summarize(aligned = sum(width)) %>%
full_join(df %>% select(new_name, seqlen) %>% distinct()) %>%
mutate(pct_coverage = (aligned / seqlen)*100)
return(pct_cov)
}
# merge with all the VSG with 0 hits
calc_cov_20bp <- calc_cov(bt20bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
# no VSG had 0 coverage with 20bp reads
calc_cov_30bp <- calc_cov(bt30bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
calc_cov_30bp <- rbind(calc_cov_30bp, nohits30bp %>%
mutate(aligned = 0) %>%
mutate(pct_coverage = 0) %>%
select(genome, new_name, aligned, seqlen, pct_coverage))
calc_cov_50bp <- calc_cov(bt50bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
calc_cov_50bp <- rbind(calc_cov_50bp, nohits50bp %>%
mutate(aligned = 0) %>%
mutate(pct_coverage = 0) %>%
select(genome, new_name, aligned, seqlen, pct_coverage))
Compare coverage by domain
# focus on 30bp set for more in-depth analysis
Nbound <- read_csv("FigureData/AllPatient_orf_VSGs_merged-translated_NtermSummary.csv", show_col_types = FALSE) %>% select(original_seqID, nterm_seq_length) %>% mutate(nterm_coord = nterm_seq_length*3)
colnames(Nbound)[1] <- "seq_ID"
plot30bp <- merge(bt30bp[grepl("Gambiense", bt30bp$new_name), ], genomeMeta)
plot30bp <- left_join(plot30bp, Nbound, by = 'seq_ID') %>%
mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's"))
# pick up any ranges that span the C-term/N-term boundary
DomSpans <-plot30bp[plot30bp$start <= plot30bp$nterm_coord & plot30bp$end >= plot30bp$nterm_coord, ] %>%
select(new_name, genome, start, end, nterm_coord, seqlen)
addtoNterm <- DomSpans %>% select(new_name, genome, start, nterm_coord) %>% dplyr::rename(end = nterm_coord) %>% mutate(width = (end - start)+1)
addtoCterm <- DomSpans %>% select(new_name, genome,nterm_coord, end) %>% dplyr::rename(start = nterm_coord) %>% mutate(width = (end - start)+1)
Nterm <- plot30bp[plot30bp$end <= plot30bp$nterm_coord, ] %>%
select(new_name, genome, start, end, width) %>%
rbind(.,addtoNterm) %>%
distinct() %>%
group_by(new_name, genome) %>%
summarize(aligned = sum(width)) %>%
merge(plot30bp %>% select(new_name, nterm_coord) %>% distinct()) %>%
mutate(pct_coverage = (aligned / nterm_coord)*100) %>%
select(new_name, genome, pct_coverage, aligned, nterm_coord) %>%
dplyr::rename(domain_length = nterm_coord)
## `summarise()` has grouped output by 'new_name'. You can override using the
## `.groups` argument.
# make entries for VSG with no hits
patientVSG <- unique(plot30bp$new_name)
genomes <- unique(bt30bp$genome)
add0Nhit <- data.frame(new_name = character(),
genome = character(),
pct_coverage = double())
for (i in 1:36){
y <- data.frame(new_name = setdiff(patientVSG, Nterm[Nterm$genome == genomes[i], ] %>% .$new_name)) %>%
mutate(genome = genomes[i]) %>%
mutate(pct_coverage = 0, aligned = 0)
add0Nhit <- rbind(add0Nhit, y)
}
add0Nhit <- inner_join(add0Nhit, plot30bp %>% select(new_name, nterm_coord)) %>% distinct() %>% dplyr::rename(domain_length = nterm_coord)
## Joining, by = "new_name"
Nterm <- rbind(Nterm, add0Nhit)
Cterm <- plot30bp[plot30bp$start >= plot30bp$nterm_coord, ] %>%
select(new_name, genome, start, end, width) %>%
rbind(.,addtoCterm) %>%
distinct() %>%
group_by(new_name, genome) %>%
summarize(aligned = sum(width)) %>%
merge(plot30bp %>% select(new_name, nterm_coord, seqlen) %>% distinct()) %>%
mutate(pct_coverage = (aligned / (seqlen - nterm_coord + 1))*100, domain_length = (seqlen - nterm_coord + 1)) %>%
select(new_name, genome, pct_coverage, aligned, domain_length)
## `summarise()` has grouped output by 'new_name'. You can override using the
## `.groups` argument.
# make entries for VSG with no hits
add0Chit <- data.frame(new_name = character(),
genome = character(),
pct_coverage = double())
for (i in 1:36){
y <- data.frame(new_name = setdiff(patientVSG, Cterm[Cterm$genome == genomes[i], ] %>% .$new_name)) %>%
mutate(genome = genomes[i]) %>%
mutate(pct_coverage = 0, aligned = 0)
add0Chit <- rbind(add0Chit, y)
}
add0Chit <- inner_join(add0Chit, plot30bp %>% select(new_name, nterm_coord, seqlen)) %>% mutate(domain_length = seqlen - nterm_coord +1) %>% distinct()
## Joining, by = "new_name"
add0Chit$domain_length <- replace(add0Chit$domain_length, add0Chit$domain_length<0, 0)
Cterm <- rbind(Cterm, add0Chit %>% select(-nterm_coord,-seqlen))
Pick a representative set for ease of viewing (141BT, BRAZO, B4_4163P), we can see from full density plots that most datasets tend to follow similar distribution.
Density plot showing the percentage of each of the patient VSG ORF sequence that had at least one whole genome sequencing read (30bp length) align for each of three representative whole genome datasets.
dist20bp <- calc_cov_20bp[grepl("Gambiense", calc_cov_20bp$new_name), ] %>%
inner_join(genomeMeta) %>%
mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's")) %>%
mutate(query_size = "20bp")
## Joining, by = "genome"
dist30bp <- calc_cov_30bp[grepl("Gambiense", calc_cov_30bp$new_name), ] %>%
inner_join(genomeMeta) %>%
mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's")) %>%
mutate(query_size = "30bp")
## Joining, by = "genome"
dist50bp <- calc_cov_50bp[grepl("Gambiense", calc_cov_50bp$new_name), ] %>%
inner_join(genomeMeta) %>%
mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's")) %>%
mutate(query_size = "50bp")
## Joining, by = "genome"
g1 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "Cote d'Ivoire 1980's", ], dist30bp[dist30bp$dist_facets == "Cote d'Ivoire 1980's", ], dist50bp[dist50bp$dist_facets == "Cote d'Ivoire 1980's", ]) %>% .[!.$genome == "tbgDAL972", ],
aes(x = pct_coverage, fill = genome)) +
geom_density(alpha = 0.4) +
scale_color_viridis(discrete = T) +
scale_fill_viridis(discrete = T) +
xlab("Percent Patient VSG Gene Coverage") +
ggtitle("Cote d'Ivoire 1980's") +
facet_wrap(~query_size, ncol = 1, scales = "free_y") +
scale_x_continuous(limits = c(0,100)) +
theme_bw()
g2 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "Cote d'Ivoire 2000's", ], dist30bp[dist30bp$dist_facets == "Cote d'Ivoire 2000's", ], dist50bp[dist50bp$dist_facets == "Cote d'Ivoire 2000's", ]),
aes(x = pct_coverage, fill = genome)) +
geom_density(alpha = 0.4) +
scale_color_viridis(discrete = T) +
scale_fill_viridis(discrete = T) +
xlab("Percent Patient VSG Gene Coverage") +
ggtitle("Cote d'Ivoire 2000's") +
facet_wrap(~query_size, ncol = 1, scales = "free_y") +
scale_x_continuous(limits = c(0,100)) +
theme_bw()
g3sum <- rbind(dist20bp[dist20bp$dist_facets == "DRC 2000's", ], dist30bp[dist30bp$dist_facets == "DRC 2000's", ], dist50bp[dist50bp$dist_facets == "DRC 2000's", ]) %>% group_by(query_size, genome) %>% summarize(pct_coverage = mean(pct_coverage))
## `summarise()` has grouped output by 'query_size'. You can override using the
## `.groups` argument.
g3 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "DRC 2000's", ], dist30bp[dist30bp$dist_facets == "DRC 2000's", ], dist50bp[dist50bp$dist_facets == "DRC 2000's", ]),
aes(x = pct_coverage, fill = genome)) +
# geom_vline(data = g3sum, aes(xintercept = pct_coverage, color = genome), linetype = "dashed") +
geom_density(alpha = 0.4) +
# scale_color_viridis(discrete = T) +
scale_fill_viridis(discrete = T) +
xlab("Percent Patient VSG Gene Coverage") +
ggtitle("DRC 2000's") +
facet_wrap(~query_size, ncol = 1, scales = "free_y") +
scale_x_continuous(limits = c(0,100)) +
theme_bw()
plot_grid(g1, g2, g3, ncol = 3)
distDomains <- rbind(Cterm %>% mutate(domain = 'C-Terminal Domain'), Nterm %>% mutate(domain = 'N-Terminal Domain')) %>%
convert_as_factor(domain, genome) %>%
merge(plot30bp %>% select(origin, year, genome) %>% distinct()) %>%
ungroup()
distDomains %>% mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's")) %>%
.[.$genome == "141BT" | .$genome == "BRAZO" | .$genome == "B4-4163P", ] %>%
ggplot(aes(x = pct_coverage, fill = groups)) +
geom_density(alpha = 0.4) +
scale_color_viridis(discrete = T) +
scale_fill_viridis(discrete = T) +
xlab("Percent Patient VSG Gene Coverage") +
ggtitle("Domain Coverage Distribution") +
facet_wrap(~domain, ncol = 1, scales = "free_y") +
scale_x_continuous(limits = c(0,100)) +
theme_bw()
Plots comparing sequence representation within the patient VSG N-terminal and C-terminal domains for each group. Representation for each VSG is quantified as the proportion of nucleotides in each domain with at least one alignment to the total number of nucleotides in that domain, with the average representation of all VSGs for each genome shown. Significant differences between groups were determined using Kruskal-Wallis followed by a post-hoc Dunn’s test
# an absolute quantification of representation by each genome by number of bases represented
Represented <- rbind(Cterm %>% mutate(domain = "C-term"), Nterm %>% mutate(domain = "N-term")) %>% group_by(domain, genome) %>% summarise(totals = sum(domain_length), aligned = sum(aligned)) %>% mutate(proportion = aligned / totals)
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
Represented <- merge(Represented, genomeMeta) %>%
mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's"))
Rep_summary <- Represented %>% group_by(domain, groups) %>% summarise(stddev = sd(proportion), proportion = mean(proportion))
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
Nplot <- ggplot(Represented[Represented$domain == "N-term", ], aes(x = groups, y = proportion, fill = groups)) + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8, position = position_dodge(1)) +
theme_bw() +
scale_fill_viridis(discrete = T) +
ggtitle("N-terminal Domain Representation") +
ylab("Nucleotides Aligned / Total VSG Nucleotides") +
xlab("") +
theme(legend.position = "none") +
geom_crossbar(data = Rep_summary[Rep_summary$domain == "N-term", ],
aes(ymin = proportion, ymax = proportion), size = 0.3, color = "black") +
geom_errorbar(data = Rep_summary[Rep_summary$domain == "N-term", ],
aes(ymin = (proportion - stddev), ymax = (proportion + stddev)), width = 0.3, color = "black") + scale_y_continuous(limits = c(0,0.2))
Cplot <- ggplot(Represented[Represented$domain == "C-term", ], aes(x = groups, y = proportion, fill = groups)) + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8, position = position_dodge(1)) +
theme_bw() +
scale_fill_viridis(discrete = T) +
ggtitle("C-terminal Domain Representation") +
ylab("Nucleotides Aligned / Total VSG Nucleotides") +
xlab("") +
theme(legend.position = "none") +
geom_crossbar(data = Rep_summary[Rep_summary$domain == "C-term", ],
aes(ymin = proportion, ymax = proportion), size = 0.3, color = "black") +
geom_errorbar(data = Rep_summary[Rep_summary$domain == "C-term", ],
aes(ymin = (proportion - stddev), ymax = (proportion + stddev)), width = 0.3, color = "black") + scale_y_continuous(limits = c(0.5,0.85))
plot_grid(Nplot, Cplot, ncol = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
N-terms
kruskal_test(Represented[Represented$domain == "N-term", ], proportion ~ groups)
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 proportion 36 17.6 2 0.000151 Kruskal-Wallis
dunn_test(Represented[Represented$domain == "N-term", ], proportion ~ groups, p.adjust.method = "holm")
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 proportion Cote d'I… Cote … 12 12 0.0194 9.85e-1 9.85e-1 ns
## 2 proportion Cote d'I… DRC 2… 12 12 3.64 2.70e-4 8.10e-4 ***
## 3 proportion Cote d'I… DRC 2… 12 12 3.62 2.91e-4 8.10e-4 ***
C-terms
kruskal_test(Represented[Represented$domain == "C-term", ], proportion ~ groups)
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 proportion 36 15.2 2 0.000498 Kruskal-Wallis
dunn_test(Represented[Represented$domain == "C-term", ], proportion ~ groups, p.adjust.method = "holm")
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 proportion Cote d'I… Cote … 12 12 -0.213 8.31e-1 0.831 ns
## 2 proportion Cote d'I… DRC 2… 12 12 3.27 1.09e-3 0.00218 **
## 3 proportion Cote d'I… DRC 2… 12 12 3.48 5.04e-4 0.00151 **
Make a map of coverage across each VSG ORF. Regions with at least one alignment from any of the 36 genomic datasets are shown in gray.
#read phyre2 output metadata file
##these are the top hits
phyre2 <- read_delim(file = "FigureData/Phyre2_summaryinfo.txt", delim = "|", show_col_types = FALSE)
colnames(phyre2)[1] <- "seq_ID"
# remove whitespace
phyre2$seq_ID <- gsub(" ", "", phyre2$seq_ID)
# join by cluster ref
phyre2 <- inner_join(phyre2, cluster_ref, by = "seq_ID")
# find regions of no homology, 0 alignments by any genome
noSimilarity <- data.frame(start = double(),
end = double(),
width = double(),
seq_ID = character())
for (i in 1:length(unique(plot30bp$seq_ID))) {
irR <- IRanges(plot30bp %>%
.[.$seq_ID == unique(plot30bp$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$start,
end = plot30bp %>%
.[.$seq_ID == unique(plot30bp$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$end)
# collapse overlapping regions to calculate VSG coverage
cov <- IRanges::reduce(irR) %>% as.data.frame()
cov <- cov %>% mutate(seq_ID = unique(plot30bp$seq_ID)[i])
noSimilarity <- rbind(noSimilarity, cov)
}
noSimilarity <- inner_join(noSimilarity, phyre2, by = "seq_ID") %>%
merge(plot30bp %>% select(new_name, seqlen, nterm_coord) %>% distinct())
noSimilarity %>% ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = 0, ymax = 1)) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name) +
xlab("Position (bp)") +
ggtitle("30bp query") +
theme_bw() +
geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "blue", size = 0.6) +
geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "red", size = 0.6)
noSimilarity[noSimilarity$new_name == "Gambiense 248", ] %>% ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = 0, ymax = 1)) +
facet_wrap(~new_name, nrow = 1) +
xlab("Position (bp)") +
theme_bw() +
geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8)
noSimilarity[noSimilarity$new_name == "Gambiense 400", ] %>% ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = 0, ymax = 1)) +
facet_wrap(~new_name, nrow = 1) +
xlab("Position (bp)") +
theme_bw() +
geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8) + scale_x_continuous(limits = c(0,1500))
noSimilarity[noSimilarity$new_name == "Gambiense 452", ] %>% ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = 0, ymax = 1)) +
facet_wrap(~new_name, nrow = 1) +
xlab("Position (bp)") +
theme_bw() +
geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8) +
scale_x_continuous(limits = c(0,1500))
Similarity_cov <- noSimilarity %>% group_by(new_name) %>%
summarize(aligned = sum(width)) %>%
full_join(bt30bp %>% select(new_name, seqlen) %>% distinct()) %>%
mutate(pct_coverage = (aligned / seqlen)*100)
## Joining, by = "new_name"
Similarity_cov
## # A tibble: 57 × 4
## new_name aligned seqlen pct_coverage
## <chr> <int> <dbl> <dbl>
## 1 Gambiense 114 768 1491 51.5
## 2 Gambiense 119 342 1491 22.9
## 3 Gambiense 169 246 1473 16.7
## 4 Gambiense 173 258 1473 17.5
## 5 Gambiense 175 316 1473 21.5
## 6 Gambiense 195 363 1464 24.8
## 7 Gambiense 2 813 1665 48.8
## 8 Gambiense 218 559 1458 38.3
## 9 Gambiense 234 538 1455 37.0
## 10 Gambiense 238 349 1455 24.0
## # … with 47 more rows
plot20bp <- inner_join(bt20bp, genomeMeta, by = "genome") %>%
mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's"))
nohits30bp <- nohits30bp %>% rename("depth" = "width") %>% mutate(end = 0)
supp_plot30bp <- rbind(bt30bp, nohits30bp) %>%
inner_join(., genomeMeta, by = "genome") %>%
mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's"))
nohits50bp <- nohits50bp %>% rename("depth" = "width") %>% mutate(end = 0)
plot50bp <- rbind(bt50bp, nohits50bp) %>%
inner_join(., genomeMeta, by = "genome") %>%
mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
year > "2004" ~ "DRC 2000's",
TRUE ~ "Cote d'Ivoire 2000's"))
p1 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "Cote d'Ivoire 1980's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3) +
xlab("Position (bp)") +
ggtitle("Cote d'Ivoire 1980's Isolates: 20bp query < 2 mismatches") +
theme_bw()
p2 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "Cote d'Ivoire 2000's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3) +
xlab("Position (bp)") +
ggtitle("Cote d'Ivoire 2000's Isolates: 20bp query < 2 mismatches") +
theme_bw()
p3 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "DRC 2000's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3) +
xlab("Position (bp)") +
ggtitle("DRC 2000's isolates: 20bp query < 2 mismatches") +
theme_bw()
plot_grid(p1,p2,p3, ncol = 3)
p4 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "Cote d'Ivoire 1980's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
xlab("Position (bp)") +
ggtitle("Cote d'Ivoire 1980's Isolates: 30bp query < 2 mismatches") +
theme_bw()
p5 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "Cote d'Ivoire 2000's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
xlab("Position (bp)") +
ggtitle("Cote d'Ivoire 2000's Isolates: 30bp query < 2 mismatches") +
theme_bw()
p6 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "DRC 2000's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
group_by(groups) %>%
mutate(plotbins = as.factor(genome) %>% as.numeric()) %>%
ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = genome), show.legend = T) +
scale_fill_viridis(discrete = T) +
facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
xlab("Position (bp)") +
ggtitle("DRC 2000's Isolates: 30bp query < 2 mismatches") +
theme_bw()
plot_grid(p4, p5, p6, ncol = 3)
# pull % coverage values for controls
mapCTl20bp <- calc_cov_20bp[!(grepl("Gambiense", calc_cov_20bp$new_name)), ]
mapCTl30bp <- calc_cov_30bp[!(grepl("Gambiense", calc_cov_30bp$new_name)), ]
# calculate average coverage for each control gene
mapCTl20bp %>% group_by(new_name) %>% summarise(mean_cov_20bp = mean(pct_coverage))
## # A tibble: 13 × 2
## new_name mean_cov_20bp
## <chr> <dbl>
## 1 Caeel_Ric-3 81.6
## 2 Caeel_Sol1 77.1
## 3 Caeel_zyg11 68.7
## 4 Drome_fas3 72.3
## 5 Drome_LBR 60.3
## 6 Drome_Wee1 66.8
## 7 K12_enolase 56.3
## 8 K12_ftsZ 57.8
## 9 K12_TufA 55.8
## 10 Tbg_GAPDH 100.
## 11 Tbg_LiTat1.3 99.4
## 12 Tbg_LiTat1.5 99.3
## 13 Tbg_TGSGP 99.5
mapCTl20bp[!(grepl("Tbg", mapCTl20bp$new_name)), ] %>% group_by(new_name) %>% summarise(mean_cov_20bp = mean(pct_coverage)) %>% .$mean_cov_20bp %>% mean()
## [1] 66.30604
mapCTl30bp %>% group_by(new_name) %>% summarise(mean_cov_30bp = mean(pct_coverage))
## # A tibble: 13 × 2
## new_name mean_cov_30bp
## <chr> <dbl>
## 1 Caeel_Ric-3 0.475
## 2 Caeel_Sol1 1.89
## 3 Caeel_zyg11 0.984
## 4 Drome_fas3 1.34
## 5 Drome_LBR 0.857
## 6 Drome_Wee1 0.486
## 7 K12_enolase 2.05
## 8 K12_ftsZ 1.45
## 9 K12_TufA 3.18
## 10 Tbg_GAPDH 100.
## 11 Tbg_LiTat1.3 96.2
## 12 Tbg_LiTat1.5 98.5
## 13 Tbg_TGSGP 97.7
mapCTl30bp[!(grepl("Tbg", mapCTl30bp$new_name)), ] %>% group_by(new_name) %>% summarise(mean_cov_30bp = mean(pct_coverage)) %>% .$mean_cov_30bp %>% mean()
## [1] 1.413298
Summary of Bowtie alignment hits for each assembled gHAT patient VSG against the genomic sequences.
Base-pair coordinates of each patient VSG is plotted as the X-axis, and each facet designates the patient VSG as well as the full ORF sequence length. Bars color-coded by genome dataset group show alignment length and position within the VSG ORF sequence for genomic sequence fragments of 30bp in length.
# make plot of ranges by genome group. Collapse ranges for all genomes isolated in same year/region
CDI1980 <- plot30bp[plot30bp$year < "1999", ]
CDI2000 <- plot30bp[plot30bp$year > "1999" & plot30bp$year < "2005", ]
DRC2000 <- plot30bp[plot30bp$year > "2004", ]
CDI1980.noSimilarity <- data.frame(start = double(),
end = double(),
width = double(),
seq_ID = character())
for (i in 1:length(unique(CDI1980$seq_ID))) {
irR <- IRanges(CDI1980 %>%
.[.$seq_ID == unique(CDI1980$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$start,
end = CDI1980 %>%
.[.$seq_ID == unique(CDI1980$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$end)
# collapse overlapping regions to calculate VSG coverage
cov <- IRanges::reduce(irR) %>% as.data.frame()
cov <- cov %>% mutate(seq_ID = unique(CDI1980$seq_ID)[i])
CDI1980.noSimilarity <- rbind(CDI1980.noSimilarity, cov)
}
CDI2000.noSimilarity <- data.frame(start = double(),
end = double(),
width = double(),
seq_ID = character())
for (i in 1:length(unique(CDI2000$seq_ID))) {
irR <- IRanges(CDI2000 %>%
.[.$seq_ID == unique(CDI2000$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$start,
end = CDI2000 %>%
.[.$seq_ID == unique(CDI2000$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$end)
# collapse overlapping regions to calculate VSG coverage
cov <- IRanges::reduce(irR) %>% as.data.frame()
cov <- cov %>% mutate(seq_ID = unique(CDI2000$seq_ID)[i])
CDI2000.noSimilarity <- rbind(CDI2000.noSimilarity, cov)
}
DRC2000.noSimilarity <- data.frame(start = double(),
end = double(),
width = double(),
seq_ID = character())
for (i in 1:length(unique(DRC2000$seq_ID))) {
irR <- IRanges(DRC2000 %>%
.[.$seq_ID == unique(DRC2000$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$start,
end = DRC2000 %>%
.[.$seq_ID == unique(DRC2000$seq_ID)[i], ] %>%
select(seq_ID, start, end) %>%
distinct() %>%
.$end)
# collapse overlapping regions to calculate VSG coverage
cov <- IRanges::reduce(irR) %>% as.data.frame()
cov <- cov %>% mutate(seq_ID = unique(DRC2000$seq_ID)[i])
DRC2000.noSimilarity <- rbind(DRC2000.noSimilarity, cov)
}
GroupRanges <- rbind(CDI1980.noSimilarity %>%
mutate(group = "Cote d'Ivoire 1980's"),
CDI2000.noSimilarity %>%
mutate(group = "Cote d'Ivoire 2000's"),
DRC2000.noSimilarity %>% mutate(group = "DRC 2000's")) %>%
mutate(plotbins = as.factor(.$group) %>% as.numeric()) %>%
inner_join(phyre2, by = "seq_ID") %>%
merge(plot30bp %>% select(new_name, nterm_coord, seqlen) %>% unique())
GroupRanges %>% ggplot() +
geom_rect(aes(xmin = start, xmax = end,
ymin = plotbins, ymax = plotbins + 0.9,
fill = group), show.legend = T) +
scale_fill_viridis(discrete = T) +
geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "blue", size = 0.6) +
geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "red", size = 0.6) +
facet_wrap(~new_name) +
xlab("Position (bp)") +
ggtitle("Position and Range of Bowtie Alignments: 30bp query < 2 mismatches") +
theme_bw()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rstatix_0.7.0 cowplot_1.1.1 colourvalues_0.3.7
## [4] viridis_0.6.2 viridisLite_0.4.0 IRanges_2.26.0
## [7] S4Vectors_0.30.2 BiocGenerics_0.38.0 forcats_0.5.1
## [10] stringr_1.4.0 dplyr_1.0.9 purrr_0.3.4
## [13] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7
## [16] ggplot2_3.3.6 tidyverse_1.3.1 reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.3 sass_0.4.1 bit64_4.0.5 vroom_1.5.7
## [5] jsonlite_1.8.0 carData_3.0-5 modelr_0.1.8 bslib_0.3.1
## [9] assertthat_0.2.1 highr_0.9 cellranger_1.1.0 yaml_2.3.5
## [13] pillar_1.7.0 backports_1.4.1 glue_1.6.2 digest_0.6.29
## [17] rvest_1.0.2 colorspace_2.0-3 htmltools_0.5.2 plyr_1.8.7
## [21] pkgconfig_2.0.3 broom_0.8.0 haven_2.5.0 scales_1.2.0
## [25] tzdb_0.3.0 farver_2.1.0 generics_0.1.2 car_3.1-0
## [29] ellipsis_0.3.2 withr_2.5.0 cli_3.3.0 magrittr_2.0.3
## [33] crayon_1.5.1 readxl_1.4.0 evaluate_0.15 fs_1.5.2
## [37] fansi_1.0.3 xml2_1.3.3 tools_4.1.1 hms_1.1.1
## [41] lifecycle_1.0.1 munsell_0.5.0 reprex_2.0.1 compiler_4.1.1
## [45] jquerylib_0.1.4 rlang_1.0.2 grid_4.1.1 rstudioapi_0.13
## [49] labeling_0.4.2 rmarkdown_2.14 gtable_0.3.0 abind_1.4-5
## [53] DBI_1.1.3 R6_2.5.1 gridExtra_2.3 lubridate_1.8.0
## [57] knitr_1.39 bit_4.0.4 fastmap_1.1.0 utf8_1.2.2
## [61] stringi_1.7.6 Rcpp_1.0.8.3 vctrs_0.4.1 dbplyr_2.2.0
## [65] tidyselect_1.1.2 xfun_0.31
By Jaime So for the Mugnier Lab
jso4@jhmi.edu